1 Overview & Preparation

The purpose of this part is to visualize and assess the data in preparation for model fitting; it also fulfills the first part of the final project requirements.

1.1 Package import

library(tidyverse)
library(visdat)

1.2 Data Import

# Load the training set; the first row of the CSV supplies the column names
# (readr's default behaviour).
df_all <- readr::read_csv(file = "final_project_train.csv")

# Compact column-by-column preview: type and leading values of every column.
glimpse(df_all)
## Rows: 677
## Columns: 38
## $ rowid    <dbl> 1, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 22, 24, 25, 27,…
## $ region   <chr> "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "…
## $ customer <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "…
## $ xb_01    <dbl> 4.000000, 1.000000, 2.000000, 2.520000, 2.548387, 3.071429, 3…
## $ xb_02    <dbl> 4, 1, 2, 11, 6, 6, 10, 12, 9, 10, 8, 10, 10, 8, 6, 10, 13, 10…
## $ xb_03    <dbl> 4, 1, 2, -6, -1, 1, -4, -4, -2, -4, -2, -2, -2, -4, 1, -4, -3…
## $ xn_01    <dbl> 3.0000000, 2.0000000, 2.0000000, 1.5333333, 0.8387097, 1.8571…
## $ xn_02    <dbl> 3, 2, 4, 9, 3, 8, 6, 10, 10, 4, 6, 8, 9, 5, 7, 12, 12, 6, 6, …
## $ xn_03    <dbl> 3, 2, 0, -3, -4, -2, -5, -6, -3, -5, -3, -6, -4, -3, 0, -5, -…
## $ xa_01    <dbl> 12.000000, 3.000000, 9.000000, 7.080000, 6.451613, 6.857143, …
## $ xa_02    <dbl> 12, 3, 9, 29, 17, 18, 24, 27, 20, 19, 15, 24, 24, 15, 14, 26,…
## $ xa_03    <dbl> 12, 3, 9, -7, -2, 2, -9, -5, -3, -3, -1, 1, -2, -3, 3, -4, -5…
## $ xb_04    <dbl> 1.3333333, 1.0000000, 1.0000000, 0.8950476, 1.2247312, 1.1857…
## $ xb_05    <dbl> 1.3333333, 1.0000000, 1.0000000, -2.0000000, -0.5000000, 0.00…
## $ xb_06    <dbl> 1.333333, 1.000000, 1.000000, 4.000000, 4.000000, 3.000000, 6…
## $ xb_07    <dbl> 4.000000, 1.000000, 2.000000, 1.933333, 1.967742, 1.714286, 1…
## $ xb_08    <dbl> -1.00000000, 1.00000000, 0.00000000, -0.08000000, 0.35483871,…
## $ xn_04    <dbl> 1.0000000, 2.0000000, 1.0000000, 0.5268889, 0.4688172, 0.5607…
## $ xn_05    <dbl> 1.0000000, 2.0000000, 0.0000000, -1.0000000, -1.3333333, -1.0…
## $ xn_06    <dbl> 1.0, 2.0, 2.0, 2.5, 3.0, 2.0, 4.0, 4.0, 3.0, 2.0, 2.0, 2.5, 2…
## $ xn_07    <dbl> 3.000000, 2.000000, 2.500000, 1.493333, 1.225806, 1.642857, 1…
## $ xn_08    <dbl> -1.0000000, 2.0000000, -1.0000000, -0.4400000, -0.4516129, -0…
## $ xa_04    <dbl> 6.000000, 3.000000, 6.750000, 2.425333, 3.023656, 2.685714, 2…
## $ xa_05    <dbl> 6.0000000, 3.0000000, 4.5000000, -3.5000000, -0.6666667, 0.40…
## $ xa_06    <dbl> 6.000000, 3.000000, 9.000000, 9.000000, 13.000000, 6.000000, …
## $ xa_07    <dbl> 9.000000, 3.000000, 7.500000, 4.466667, 4.612903, 4.071429, 4…
## $ xa_08    <dbl> 3.0000000, 3.0000000, 6.0000000, 0.7066667, 1.3225806, 1.3571…
## $ xw_01    <dbl> 23.00000, 17.00000, 52.50000, 64.52564, 54.75758, 58.33333, 6…
## $ xw_02    <dbl> 23, 17, 48, 0, 12, 15, 0, 0, 0, 7, 14, 0, 0, 0, 8, 8, 0, 4, 2…
## $ xw_03    <dbl> 23, 17, 57, 106, 105, 101, 107, 109, 109, 104, 109, 99, 103, …
## $ xs_01    <dbl> 0.262073307, 0.330804757, 0.239795763, 0.142106837, 0.2442957…
## $ xs_02    <dbl> 0.26207331, 0.33080476, 0.19049123, -0.73321509, -0.12204299,…
## $ xs_03    <dbl> 0.2620733, 0.3308048, 0.2891003, 0.5500723, 1.3134719, 0.6540…
## $ xs_04    <dbl> 0.5375576, 0.4286607, 0.3676937, 0.2865445, 0.2375470, 0.2594…
## $ xs_05    <dbl> 0.5375575604, 0.4286607050, 0.2485001680, 0.0000000000, 0.043…
## $ xs_06    <dbl> 0.5375576, 0.4286607, 0.4868872, 0.6357541, 0.4327004, 0.8672…
## $ response <dbl> 2.617991, 1.184632, 2.216626, 2.726715, 1.483323, 2.039279, 1…
## $ outcome  <chr> "non_event", "non_event", "event", "non_event", "non_event", …

2 Exploratory Data Analysis (EDA)

2.1 General Inspection

No missing data

# Plot the missingness pattern of every column.
df_all %>% visdat::vis_miss()

# Number of distinct values in each column.
purrr::map_dbl(df_all, n_distinct)
##    rowid   region customer    xb_01    xb_02    xb_03    xn_01    xn_02 
##      677        3        9      229       19       21      225       18 
##    xn_03    xa_01    xa_02    xa_03    xb_04    xb_05    xb_06    xb_07 
##       18      257       38       35      364       59       51      181 
##    xb_08    xn_04    xn_05    xn_06    xn_07    xn_08    xa_04    xa_05 
##      187      360       51       47      174      174      411       87 
##    xa_06    xa_07    xa_08    xw_01    xw_02    xw_03    xs_01    xs_02 
##       87      213      212      396      102      103      676      644 
##    xs_03    xs_04    xs_05    xs_06 response  outcome 
##      672      676      663      676      677        2

2.2 Distributions of variables.

2.2.1 Categorical variables

Q: Counts for categorical variables.

A: From the figure, the outcome classes look very imbalanced.

# Bar chart of the number of rows in each `outcome` class.
ggplot(
  data = count(df_all, outcome),
  mapping = aes(x = outcome, y = n)
) +
  geom_col()

2.2.2 Continuous variables

Q: Distributions for continuous variables. Are the distributions Gaussian like?

A: Most of them look Gaussian-like.

# Keep every column whose name starts with "x" (the numeric inputs).
df_con_all <- df_all %>% select(starts_with("x"))

# One density curve per input. A row index is added so the inputs can be
# stacked into name/value pairs, then each input gets its own panel with free
# axes. (The former `select(all_of(colnames(df_con_all)))` step selected every
# column the frame already had, so it has been dropped.)
df_con_all %>%
  tibble::rowid_to_column() %>%
  pivot_longer(cols = -rowid) %>%
  ggplot() +
  geom_density(mapping = aes(x = value), adjust = 1.35, size = 0.5) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  # Tick labels are hidden: only the shape of each distribution matters here.
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())

2.2.3 Log-response

# Histogram of the log-transformed response, with a rug marking each observation.
ggplot(data = df_all, mapping = aes(x = log(response))) +
  geom_histogram(bins = 35) +
  geom_rug(alpha = 0.2) +
  theme_bw()

2.3 Effect of categorical variables

Q: Are there differences in continuous variable distributions and continuous variable summary statistics based on region or customer?

A: Yes, if we zoom in, we do observe some differences, but not very significant.

# Long format: one row per (observation, input) pair. `region` and `customer`
# are kept as identifier columns; the row id and both outputs are dropped.
df_trans_02 <- df_all %>%
  select(-c(rowid, outcome, response)) %>%
  pivot_longer(cols = !c(region, customer))

# Density of each input, one curve per region.
# x must be the measured `value`: mapping x to `name` (the variable label)
# never draws the distribution of the measurements themselves.
# Both axes are freed per panel because the inputs live on very different
# ranges; a shared x axis would flatten most panels.
df_trans_02 %>%
  ggplot(mapping = aes(x = value, color = as.factor(region))) +
  geom_density() +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())

Again, this figure shows that the medians of the continuous variables differ between regions.

# Horizontal box plots of every input: one panel per input, one box per region.
ggplot(
  data = df_trans_02,
  mapping = aes(
    x = value,
    y = as.factor(name),
    fill = as.factor(region),
    color = as.factor(region)
  )
) +
  geom_boxplot(alpha = 0.35, size = 0.1) +
  # Same viridis palette and legend title for fill and outline.
  scale_color_viridis_d("Region") +
  scale_fill_viridis_d("Region") +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())

Q: Are there differences in continuous variable distributions and continuous variable summary statistics based on the binary outcome?

A: Yes, if we zoom in, the differences are relatively obvious.

# Density of each input, one curve per customer.
# x must be the measured `value`: mapping x to `name` (the variable label)
# never draws the distribution of the measurements themselves.
# Both axes are freed per panel because the inputs live on very different ranges.
# NOTE(review): the question above asks about the binary outcome, but
# `df_trans_02` does not carry `outcome`, so this plot groups by customer.
# Confirm which grouping is intended.
df_trans_02 %>%
  ggplot(mapping = aes(x = value, color = customer)) +
  geom_density() +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())

# Vertical box plots of every input: one panel per input, one box per customer.
ggplot(
  data = df_trans_02,
  mapping = aes(
    x = as.factor(name),
    y = value,
    fill = as.factor(customer),
    color = as.factor(customer)
  )
) +
  geom_boxplot(alpha = 0.35, outlier.size = 0.1) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())

3 Correlation and Relationship Diagnosis

3.1 Continuous Inputs & Inputs

Q: Visualize the relationships between the continuous inputs, are they correlated?

A: Some of the inputs are highly correlated with each other.

# Pairwise correlation matrix of the inputs, drawn as its upper triangle.
corrplot::corrplot(cor(df_con_all), type = "upper")

3.2 Continuous Inputs & Log-response

Q: Visualize the relationships between the continuous outputs (response and the log-transformed response) with respect to the continuous inputs. Can you identify any clear trends? Do the trends depend on the categorical inputs?

A: The output increases with some of the inputs, such as xa_01, xa_02 and xa_03, but not with all of them. As we can see, the categorical inputs do have an impact on the trends.

# Long format for the trend plots: every input is stacked into name/value
# pairs, while both forms of the response and the two categorical inputs are
# repeated on each row.
df_trans_04 <- df_all %>%
  mutate(log_response = log(response)) %>%
  select(starts_with("x"), log_response, response, customer, region) %>%
  pivot_longer(cols = starts_with("x"))

# Row count of the long table.
count(df_trans_04)
## # A tibble: 1 × 1
##       n
##   <int>
## 1 22341
# Loess trend of the log-response against each input, one curve per region and
# one panel per input.
ggplot(
  data = df_trans_04,
  mapping = aes(x = value, y = log_response, color = region)
) +
  geom_smooth(method = "loess", formula = y ~ x, size = 0.4) +
  facet_wrap(~name, scales = "free") +
  scale_color_viridis_d(option = "plasma") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())

# Loess trend of the log-response against each input, one curve per customer
# and one panel per input.
ggplot(
  data = df_trans_04,
  mapping = aes(x = value, y = log_response, color = customer)
) +
  geom_smooth(method = "loess", formula = y ~ x, size = 0.4) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.x.bottom = element_blank(), axis.text.y = element_blank())

3.3 Continuous Inputs & Binary outcome

Q: How can you visualize the behavior of the binary outcome with respect to the continuous inputs?

A: As shown below, we can’t just draw a vertical line to divide them.

# Scatter of the binary outcome against each input, one panel per input.
# The former `mutate(log_response = log(response))` step was removed: the
# column it created was dropped by the `select()` right after it.
df_all %>%
  select(starts_with("x"), outcome) %>%
  pivot_longer(cols = !outcome) %>%
  ggplot(mapping = aes(x = value, y = outcome)) +
  geom_point(mapping = aes(color = outcome), size = 0.1) +
  facet_wrap(~name, scales = "free") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x.bottom = element_blank())